Raw data quality control
Initialization
We start the analysis by initializing the packages required for all the analysis performed in this section. We also define the root directory, within which all the input/output operations for this project will be performed. At the end of this document, detailed software version information is provided for easier reproducibility of the analysis.
library(reshape2)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(lsr)
library(plotly)
library(DT)
library(WriteXLS)
library(ggallin)
path = "/Users/ashwin/Documents/Projects/YeastScreen/Nonessential_MATa_screen/"Raw data summary
We show below the summary for the raw screening data - Yeast Non-essential MATa library.
rawDat = readRDS(paste0(path, "data/workspaces/YeastRedox_MATa_RawData.RDS"))
rawDat$Signal_485_520 = log10(rawDat$Signal_485_520)
rawDat$Signal_400_520 = log10(rawDat$Signal_400_520)
summary(rawDat) Plate Well_96 Well Group
102 : 4512 E10 : 2496 Q13 : 156 AZ : 2496
105 : 4512 E4 : 2496 Q14 : 156 BE : 2496
108 : 4512 E9 : 2496 Q15 : 156 BF : 2496
109 : 4512 A1 : 2448 Q16 : 156 A : 2448
110 : 4512 A10 : 2448 Q33 : 156 AB : 2448
112 : 4512 A11 : 2448 Q34 : 156 AD : 2448
(Other):200736 (Other):212976 (Other):226872 (Other):212976
Content SystematicName SGD.ID Gene.Symbol
Blank :56952 Length:227808 Length:227808 Length:227808
Control :56952 Class :character Class :character Class :character
Cytoplasm :56952 Mode :character Mode :character Mode :character
Mitochondria:56952
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :-7491.00 Min. :3.170 Min. :3.819 Glucose :75936
1st Qu.: 0.36 1st Qu.:4.197 1st Qu.:4.834 Galactose:75936
Median : 0.49 Median :4.683 Median :4.903 Glycerol :75936
Mean : 0.48 Mean :4.574 Mean :4.918
3rd Qu.: 0.64 3rd Qu.:4.926 3rd Qu.:5.012
Max. : 1122.00 Max. :5.415 Max. :5.415
NA's :56952
Background Fluorescence
Firstly, we look at the distribution of background flourescence across all plates and retain only those colonies for Control, Mitochondria and Cytoplasm whose fluorescence signal exceeds the 90th percentile of blank 485-520 and 400-520 fluorescence signals
tmpdat = rawDat[, c("Plate", "Content", "Signal_485_520", "Signal_400_520", "Type")]
tmpdat = melt(tmpdat)
a = droplevels(tmpdat[tmpdat$variable == "Signal_485_520" & tmpdat$Content == "Blank",
])
per90_485_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_485_520 = sapply(per90_485_520, function(x) sapply(x, function(y) quantile(y$value,
0.9)))
per90_485_520 = melt(t(per90_485_520))
per90_485_520$Var2 = gsub(".90%", "", per90_485_520$Var2)
colnames(per90_485_520) = c("Plate", "Type", "value")
a = droplevels(tmpdat[tmpdat$variable == "Signal_400_520" & tmpdat$Content == "Blank",
])
per90_400_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_400_520 = sapply(per90_400_520, function(x) sapply(x, function(y) quantile(y$value,
0.9)))
per90_400_520 = melt(t(per90_400_520))
per90_400_520$Var2 = gsub(".90%", "", per90_400_520$Var2)
colnames(per90_400_520) = c("Plate", "Type", "value")
# Filtering out colonies
if (identical(paste0(per90_400_520$Plate, per90_400_520$Type), paste0(per90_485_520$Plate,
per90_485_520$Type))) {
cutoff = data.frame(per90_400_520[, 1:2], per90_400_520 = per90_400_520$value,
per90_485_520 = per90_485_520$value, stringsAsFactors = F)
}
rawDatFilt = lapply(split(rawDat, rawDat$Plate), function(x) split(x, x$Type))
for (i in 1:nrow(cutoff)) {
x = rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]]
x = x[x$Signal_485_520 > cutoff$per90_485_520[i] & x$Signal_400_520 > cutoff$per90_400_520[i],
]
rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]] = x
}
rawDatFilt = do.call("rbind", lapply(rawDatFilt, function(x) do.call("rbind", x)))
# Plot background fluorescence distributions
p1 = ggplot(tmpdat, aes(x = value, fill = Content)) + geom_density(alpha = 0.5) +
theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen", x = "Fluorescence signal (in log10)",
y = "Density", fill = "") + facet_wrap(variable ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
p2 = ggplot(tmpdat[tmpdat$variable == "Signal_485_520", ], aes(x = value, fill = Content)) +
geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen | Signal_485_520",
x = "Fluorescence signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_485_520,
aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
p3 = ggplot(tmpdat[tmpdat$variable == "Signal_400_520", ], aes(x = value, fill = Content)) +
geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Non-essential MATa screen | Signal_400_520",
x = "Fluorescence signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_400_520,
aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence.pdf"), plot = p1,
width = 7, height = 3.5)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_485_520.pdf"),
plot = p2, width = 7, height = 60, limitsize = F)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_400_520.pdf"),
plot = p3, width = 7, height = 60, limitsize = F)
p1[1] "total number of colonies in the raw data - 227808"
paste0("total number of colonies in the background flourescence filtered data - ",
nrow(rawDatFilt))[1] "total number of colonies in the background flourescence filtered data - 127459"
[1] "% of colonies KEPT - 55.95"
Cleaning raw data
Firstly we read the raw yeast redox screen data and remove outlier (extreme values) and NA. Typically, outlier removal is not considered good data analysis practice, however, we do it here because we know that the extreme values comes from technical artifacts. For example, roGFP2 ratios cannot be negative and those which are significantly higher than the plate average are ambiguous signals.
Outliers were detected as -
- upper outlier values > 75th quantile value for a plate + 3 * plate IQR value
- lower outlier values < 25th quantile value for a plate - 3 * plate IQR value
The outlier values are all set to NA and values greater than the lower bound threshold but less than 0 were set to pseudo minimum value of 0.0001 as roGFP2 ratios can’t be less than 0.
We show these extreme outliers below, ploting the distribution of values from each plate.
tmpdat = na.omit(rawDat)
ggplot(tmpdat, aes(x = roGFP2.ratio, color = Plate)) + geom_density() + theme_bw(base_size = 8) +
labs(x = "roGFP2 ratio (in log10)") + scale_x_continuous(trans = pseudolog10_trans,
breaks = c(-50, -2:5, 50)) + facet_wrap(Content ~ Type, scales = "free") + theme(legend.position = "none",
panel.grid = element_blank())rawDatCleaned = vector("list", length = 3)
names(rawDatCleaned) = c("Glucose", "Galactose", "Glycerol")
for (i in names(rawDatCleaned)) {
dat = rawDat[rawDat$Type == i, ]
dat = split(dat, dat$Plate)
dat = lapply(dat, function(x) {
vals = x$roGFP2.ratio
lb = quantile(vals, probs = 0.25, na.rm = T) - (3 * IQR(vals, na.rm = T))
ub = quantile(vals, probs = 0.75, na.rm = T) + (3 * IQR(vals, na.rm = T))
vals[vals < lb | vals > ub] = NA
vals[vals > lb & vals < 0] = 10^-4
x$roGFP2.ratio = vals
x = x[!is.na(x$roGFP2.ratio), ]
return(x)
})
rawDatCleaned[[i]] = do.call("rbind", dat)
rm(dat)
}
rawDatCleaned = do.call("rbind", rawDatCleaned)
rawDatCleaned = droplevels(rawDatCleaned)
rownames(rawDatCleaned) = 1:nrow(rawDatCleaned)Next, lets check the summaries of the raw roGFP2 redox screen data and the cleaned raw data for comparison.
Raw data looked like -
Plate Well_96 Well Group
104 : 3418 A12 : 1943 O48 : 152 L : 1943
102 : 3257 H12 : 1934 P48 : 151 CR : 1934
107 : 3242 D12 : 1867 S48 : 151 AV : 1867
142 : 3103 A11 : 1866 S47 : 150 K : 1866
116 : 2903 E12 : 1843 T48 : 150 BH : 1843
101 : 2895 C12 : 1829 A13 : 149 AJ : 1829
(Other):108641 (Other):116177 (Other):126556 (Other):116177
Content SystematicName SGD.ID Gene.Symbol
Blank : 3743 Length:127459 Length:127459 Length:127459
Control :35277 Class :character Class :character Class :character
Cytoplasm :40939 Mode :character Mode :character Mode :character
Mitochondria:47500
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :-331.890 Min. :3.920 Min. :4.260 Glucose :43926
1st Qu.: 0.390 1st Qu.:4.710 1st Qu.:4.905 Galactose:43449
Median : 0.508 Median :4.894 Median :4.986 Glycerol :40084
Mean : 0.529 Mean :4.840 Mean :4.998
3rd Qu.: 0.667 3rd Qu.:5.019 3rd Qu.:5.078
Max. : 78.548 Max. :5.415 Max. :5.415
NA's :3743
Cleaned data looked like -
Plate Well_96 Well Group
104 : 3333 D12 : 1736 O48 : 152 AV : 1736
102 : 3162 C12 : 1710 S48 : 151 AJ : 1710
107 : 3140 E12 : 1710 P48 : 150 BH : 1710
142 : 3004 A12 : 1703 S47 : 150 CF : 1703
116 : 2834 G12 : 1703 T48 : 150 L : 1703
118 : 2833 H12 : 1698 A13 : 149 CR : 1698
(Other):104714 (Other):112760 (Other):122118 (Other):112760
Content SystematicName SGD.ID Gene.Symbol
Control :35133 Length:123020 Length:123020 Length:123020
Cytoplasm :40772 Class :character Class :character Class :character
Mitochondria:47115 Mode :character Mode :character Mode :character
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :0.0001 Min. :3.938 Min. :4.725 Glucose :42333
1st Qu.:0.3907 1st Qu.:4.733 1st Qu.:4.906 Galactose:41968
Median :0.5079 Median :4.902 Median :4.988 Glycerol :38719
Mean :0.5356 Mean :4.858 Mean :5.000
3rd Qu.:0.6647 3rd Qu.:5.023 3rd Qu.:5.079
Max. :5.1000 Max. :5.415 Max. :5.415
Percentage of data removed after outlier removal -
[1] 3.48
Distribution of raw roGFP2 ratios
Distribution of raw roGFP2 ratios (absolute ratios) per plate
p = ggplot(rawDatCleaned) + theme_bw(base_size = 8) + geom_boxplot(aes(x = Plate,
y = roGFP2.ratio), outlier.size = 0.1, lwd = 0.2) + facet_grid(Content ~ Type,
scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 0.5,
vjust = 0.5))
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_distribution_nutrient_compartments.pdf"),
plot = p, width = 12, height = 5)
pIdentification of edge effects
A well known technical issue with yeast mutant screens is the altered growth characteristics of yeast colonies growing at the edge of plates. Next, we look at how roGFP2 ratios are different between edge wells and non-edge wells.
These are the edge wells and the mutants originating from these locations should be carefully analyzed.
edge = as.character(unlist(read.csv(paste0(path, "data/annotations/edge_wells.txt"))))
# Print edge wells
edge [1] "I01" "Q01" "Y01" "A03" "D02" "F04" "I03" "L02" "N04" "Q03" "T02" "V04"
[13] "Y03" "b02" "d04" "A09" "A17" "A25" "A33" "A41" "B05" "B13" "B21" "B29"
[25] "B37" "B45" "C09" "C17" "C25" "C33" "C41" "D05" "D13" "D21" "D29" "D37"
[37] "D45" "F45" "H45" "J45" "L45" "N45" "P45" "R45" "T45" "V45" "X45" "Z45"
[49] "b05" "b13" "b21" "b29" "b37" "b45" "c09" "c17" "c25" "c33" "c41" "d05"
[61] "d13" "d21" "d29" "d37" "d45" "e09" "e17" "e25" "e33" "e41" "J01" "R01"
[73] "Z01" "A04" "D03" "G02" "I04" "L03" "O02" "Q04" "T03" "W02" "Y04" "b03"
[85] "e02" "A10" "A18" "A26" "A34" "A42" "B06" "B14" "B22" "B30" "B38" "B46"
[97] "C10" "C18" "C26" "C34" "C42" "D06" "D14" "D22" "D30" "D38" "D46" "F46"
[109] "H46" "J46" "L46" "N46" "P46" "R46" "T46" "V46" "X46" "Z46" "b06" "b14"
[121] "b22" "b30" "b38" "b46" "c10" "c18" "c26" "c34" "c42" "d06" "d14" "d22"
[133] "d30" "d38" "d46" "e10" "e18" "e26" "e34" "e42" "K01" "S01" "a01" "B02"
[145] "D04" "G03" "J02" "L04" "O03" "R02" "T04" "W03" "Z02" "b04" "e03" "A11"
[157] "A19" "A27" "A35" "A43" "B07" "B15" "B23" "B31" "B39" "B47" "C11" "C19"
[169] "C27" "C35" "C43" "D07" "D15" "D23" "D31" "D39" "D47" "F47" "H47" "J47"
[181] "L47" "N47" "P47" "R47" "T47" "V47" "X47" "Z47" "b07" "b15" "b23" "b31"
[193] "b39" "b47" "c11" "c19" "c27" "c35" "c43" "d07" "d15" "d23" "d31" "d39"
[205] "d47" "e11" "e19" "e27" "e35" "e43" "L01" "T01" "b01" "B03" "E02" "G04"
[217] "J03" "M02" "O04" "R03" "U02" "W04" "Z03" "c02" "e04" "A12" "A20" "A28"
[229] "A36" "A44" "B08" "B16" "B24" "B32" "B40" "B48" "C12" "C20" "C28" "C36"
[241] "C44" "D08" "D16" "D24" "D32" "D40" "D48" "F48" "H48" "J48" "L48" "N48"
[253] "P48" "R48" "T48" "V48" "X48" "Z48" "b08" "b16" "b24" "b32" "b40" "b48"
[265] "c12" "c20" "c28" "c36" "c44" "d08" "d16" "d24" "d32" "d40" "d48" "e12"
[277] "e20" "e28" "e36" "e44" "M01" "U01" "c01" "B04" "E03" "H02" "J04" "M03"
[289] "P02" "R04" "U03" "X02" "Z04" "c03" "A05" "A13" "A21" "A29" "A37" "A45"
[301] "B09" "B17" "B25" "B33" "B41" "C05" "C13" "C21" "C29" "C37" "C45" "D09"
[313] "D17" "D25" "D33" "D41" "E45" "G45" "I45" "K45" "M45" "O45" "Q45" "S45"
[325] "U45" "W45" "Y45" "a45" "b09" "b17" "b25" "b33" "b41" "c05" "c13" "c21"
[337] "c29" "c37" "c45" "d09" "d17" "d25" "d33" "d41" "e05" "e13" "e21" "e29"
[349] "e37" "e45" "N01" "V01" "d01" "C02" "E04" "H03" "K02" "M04" "P03" "S02"
[361] "U04" "X03" "a02" "c04" "A06" "A14" "A22" "A30" "A38" "A46" "B10" "B18"
[373] "B26" "B34" "B42" "C06" "C14" "C22" "C30" "C38" "C46" "D10" "D18" "D26"
[385] "D34" "D42" "E46" "G46" "I46" "K46" "M46" "O46" "Q46" "S46" "U46" "W46"
[397] "Y46" "a46" "b10" "b18" "b26" "b34" "b42" "c06" "c14" "c22" "c30" "c38"
[409] "c46" "d10" "d18" "d26" "d34" "d42" "e06" "e14" "e22" "e30" "e38" "e46"
[421] "O01" "W01" "e01" "C03" "F02" "H04" "K03" "N02" "P04" "S03" "V02" "X04"
[433] "a03" "d02" "A07" "A15" "A23" "A31" "A39" "A47" "B11" "B19" "B27" "B35"
[445] "B43" "C07" "C15" "C23" "C31" "C39" "C47" "D11" "D19" "D27" "D35" "D43"
[457] "D47" "G47" "I47" "K47" "M47" "O47" "Q47" "S47" "U47" "W47" "Y47" "a47"
[469] "b11" "b19" "b27" "b35" "b43" "c07" "c15" "c23" "c31" "c39" "c47" "d11"
[481] "d19" "d27" "d35" "d43" "e07" "e15" "e23" "e31" "e39" "e47" "P01" "X01"
[493] "A02" "C04" "F03" "I02" "K04" "N03" "Q02" "S04" "V03" "Y02" "a04" "d03"
[505] "A08" "A16" "A24" "A32" "A40" "A48" "B12" "B20" "B28" "B36" "B44" "C08"
[517] "C16" "C24" "C32" "C40" "C48" "D12" "D20" "D28" "D36" "D44" "D48" "G48"
[529] "I48" "K48" "M48" "O48" "Q48" "S48" "U48" "W48" "Y48" "a48" "b12" "b20"
[541] "b28" "b36" "b44" "c08" "c16" "c24" "c32" "c40" "c48" "d12" "d20" "d28"
[553] "d36" "d44" "e08" "e16" "e24" "e32" "e40" "e48"
edgeDat = rep("non edge wells", nrow(rawDatCleaned))
edgeDat[rawDatCleaned$Well %in% edge] = "edge wells"
edge = rawDatCleaned
edge$edge = edgeDat
rm(edgeDat)Visulaizing the differences using both effect size (Fold change) and pvalue.
For none of the plates and conditions, the edge effects exceed the log fold change of +/- 1
# Computing distribution difference statistics
stats = split(edge, paste(edge$Plate, edge$Type, edge$Content, sep = "-"))
stats = t(sapply(stats, function(x) {
y = x$roGFP2.ratio[x$edge == "edge wells"]
n = x$roGFP2.ratio[x$edge == "non edge wells"]
fc = log2(median(y, na.rm = T)/median(n, na.rm = T))
pv = wilcox.test(y, n)$p.value
# cd = effectsize::cohens_d(y, n, pooled_sd = F)$Cohens_d p = signif(p,2)
# return(c(log2fc = fc, wilcoxP = pv, CohenD = cd))
return(c(log2fc = fc, wilcoxP = pv))
}))
a = do.call("rbind", strsplit(rownames(stats), "-"))
colnames(a) = c("Plate", "Nutrient", "Condition")
stats = data.frame(a, stats, row.names = NULL, stringsAsFactors = F)
stats$log2fc = round(stats$log2fc, 2)
stats$wilcoxP = signif(-log10(p.adjust(stats$wilcoxP, method = "fdr")), 2)
# stats$CohenD = round(stats$CohenD, 2)
# Volcano plot
p3 = ggplot(stats, aes(x = log2fc, y = wilcoxP, text = Plate)) + theme_bw(base_size = 8) +
xlim(-1, 1) + labs(y = "-log10(fdr pval)") + geom_point(size = 0.8) + geom_hline(yintercept = -log10(0.05)) +
# geom_vline(xintercept = c(-0.5, 0.5), lty = 2) +
geom_vline(xintercept = c(-1, 1), lty = 1) + facet_wrap(Nutrient ~ Condition, scales = "free") +
geom_text_repel(data = stats[abs(stats$log2fc) > 0.5 & stats$wilcoxP > -log10(0.05),
], aes(label = Plate), size = 2) + theme(panel.grid = element_blank())
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_edge_effects_badPlates.pdf"),
plot = p3, width = 7.5, height = 6)
ggplotly(p3)Variance among controls
Next we look into the variance exhibited by the control colonies across plates and nutrient conditions. Ideally, all control replicates for a given mutant in a given plate and nutrient condition should have similar values i.e variance close to zero. Below we show the variance of raw roGFP2 ratios among controls per plate and group.
ctrlVar = rawDatCleaned %>% filter(Content == "Control") %>% select(Gene.Symbol,
Plate, Type, Group, roGFP2.ratio) %>% group_by(Plate, Type, Group) %>% mutate(Control_variance = round(var(roGFP2.ratio,
na.rm = T), 4)) %>% ungroup() %>% select(Gene.Symbol, Plate, Type, Group, Control_variance) %>%
rename(Genes = Gene.Symbol, Nutrient = Type) %>% mutate(Plate = paste0("P-",
Plate)) %>% distinct()
plot_ly(ctrlVar, x = ~Group, y = ~Plate, z = ~Control_variance, text = ~Genes, type = "scatter3d",
mode = "markers", marker = list(size = 3), opacity = 1, color = ~Nutrient)Next we remove all genes with variance among controls > 0.05
rmv = ctrlVar[which(ctrlVar$Control_variance > 0.05), ]
rmv$Plate = gsub("P-", "", rmv$Plate)
rmv = paste(rmv$Genes, rmv$Plate, rmv$Nutrient, sep = "-")
paste0("Number of poor control genes removed - ", length(rmv))[1] "Number of poor control genes removed - 77"
matchID = paste(rawDatCleaned$Gene.Symbol, rawDatCleaned$Plate, rawDatCleaned$Type,
sep = "-")
rawDatCleaned_ctrlfilt = rawDatCleaned[!matchID %in% rmv, ]
ctrlVar = rawDatCleaned_ctrlfilt %>% filter(Content == "Control") %>% select(Gene.Symbol,
Plate, Type, Group, roGFP2.ratio) %>% group_by(Plate, Type, Group) %>% mutate(Control_variance = round(var(roGFP2.ratio,
na.rm = T), 4)) %>% ungroup() %>% select(Gene.Symbol, Plate, Type, Group, Control_variance) %>%
rename(Genes = Gene.Symbol, Nutrient = Type) %>% mutate(Plate = paste0("P-",
Plate)) %>% distinct()
plot_ly(ctrlVar, x = ~Group, y = ~Plate, z = ~Control_variance, text = ~Genes, type = "scatter3d",
mode = "markers", marker = list(size = 3), opacity = 1, color = ~Nutrient)Next we we visualize the fact that in a good screen, individual control values should not deviate much from the median value of all controls from that same plate. For a given plate \(i\), the normalized control values i.e \(NormCtrl\) is given as - \[NormCtrl_i = \frac{Ctrl_i} {median(Ctrl_i)}\] We plot these scaled control values per plate across conditions below. Ideally these values should be close to 1 (shown as red horizontal lines). A table for these scaled control values is also provided.
ctrlDat = split(rawDatCleaned_ctrlfilt, rawDatCleaned_ctrlfilt$Type)
ctrlDat = lapply(ctrlDat, function(x) {
split(x, x$Plate)
})
for (i in 1:length(ctrlDat)) {
for (j in 1:length(ctrlDat[[i]])) {
tmp = droplevels(ctrlDat[[i]][[j]])
tmp = tmp[, -2]
# Plate control
tmp.ctrl = tmp[tmp$Content == "Control", ]
# Normalizing factor
nf = median(tmp.ctrl$roGFP2.ratio, na.rm = T)
# Measure control value deviation from plate median
ctrlDat[[i]][[j]] = data.frame(Gene = tmp.ctrl$Gene.Symbol, NormControl = round(log2(tmp.ctrl$roGFP2.ratio/nf),
3), stringsAsFactors = F)
# Deleting
rm(tmp, tmp.ctrl, nf)
}
rm(j)
}
rm(i)
ctrlDat = melt(ctrlDat, id = "Gene")
ctrlDat = ctrlDat[, c(4, 5, 1, 3)]
colnames(ctrlDat) = c("Plate", "Nutrient", "Gene", "NormControl")
ctrlDat$Plate = as.factor(ctrlDat$Plate)
ctrlDat$Nutrient = factor(ctrlDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))
# datatable(ctrlDat, rownames = FALSE, filter='top', class='compact', extensions
# = c('Buttons') , options = list(autoWidth = TRUE, dom = 'Blfrtip', buttons =
# c('csv', 'excel') ))
p = ggplot(ctrlDat, aes(x = Plate, y = NormControl)) + theme_bw(base_size = 8) +
labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") + geom_boxplot(outlier.size = 0.1,
lwd = 0.2) + facet_wrap(~Nutrient) + geom_hline(yintercept = 3, color = "orange") +
geom_hline(yintercept = -3, color = "orange") + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
pNext we remove those controls that deviate 3 log fold units over the plate control median value
rmv = ctrlDat[which(ctrlDat$NormControl > 3 | ctrlDat$NormControl < -3), ]
rmv = paste(rmv$Gene, rmv$Plate, rmv$Nutrient, sep = "-")
ctrlDat = ctrlDat[!(ctrlDat$NormControl > 3 | ctrlDat$NormControl < -3), ]
paste0("Number of control genes removed which deviated 3 log fold units from the plate median - ",
length(rmv))[1] "Number of control genes removed which deviated 3 log fold units from the plate median - 468"
matchID = paste(rawDatCleaned_ctrlfilt$Gene.Symbol, rawDatCleaned_ctrlfilt$Plate,
rawDatCleaned_ctrlfilt$Type, sep = "-")
rawDatCleaned_ctrlfilt = rawDatCleaned_ctrlfilt[!matchID %in% rmv, ]Plotting these scaled control values -
p = ggplot(ctrlDat) + theme_bw(base_size = 8) + labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") +
geom_boxplot(aes(x = Plate, y = NormControl), outlier.size = 0.1, lwd = 0.2) +
facet_wrap(~Nutrient) + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
pA more standard way of calculating the variance among the controls is by computing the robust coefficient of variance see here and here. For a given plate \(i\), this is given as - \[CoefVar_i = \frac{mad(Ctrl_i)}{median(Ctrl_i)}\] mad is Median absolute deviation
Plotting these robust coefficient of variation for control values-
ctrlDat$NormControl = 2^ctrlDat$NormControl
cvDat = split(ctrlDat, ctrlDat$Nutrient)
cvDat = lapply(cvDat, function(x) {
a = split(x, x$Plate)
b = lapply(a, function(y) mad(y$NormControl)/median(y$NormControl))
return(b)
})
cvDat = melt(cvDat)
cvDat = cvDat[, c(2, 3, 1)]
colnames(cvDat) = c("Plate", "Nutrient", "Dev")
cvDat$Plate = as.factor(cvDat$Plate)
cvDat$Nutrient = factor(cvDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))
p = ggplot(cvDat, aes(Plate, Dev)) + theme_bw(base_size = 8) + labs(y = "% of deviation from median plate control roGFP2 ratios") +
geom_bar(stat = "identity") + facet_wrap(~Nutrient) + # geom_hline(yintercept = 0.5, color = 'black') +
geom_hline(yintercept = 1, color = "black") + theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_Controls_Coef_variation.pdf"),
plot = p, width = 12, height = 3.5)
pSaving the data
Finally we save the cleaned raw Data as a .RDS (R data object), which will be used for the next step of normalization
Session information
R version 3.6.2 (2019-12-12)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggallin_0.1.1 WriteXLS_5.0.0 DT_0.12 plotly_4.9.2
[5] lsr_0.5 ggpubr_0.2.4 magrittr_1.5 ggrepel_0.8.1
[9] ggplot2_3.2.1 reshape2_1.4.3 rmdformats_0.3.6 knitr_1.28
loaded via a namespace (and not attached):
[1] tidyselect_1.0.0 xfun_0.12 purrr_0.3.3 colorspace_1.4-1
[5] vctrs_0.2.2 htmltools_0.4.0 viridisLite_0.3.0 yaml_2.2.1
[9] rlang_0.4.4 pillar_1.4.3 later_1.0.0 glue_1.3.1
[13] withr_2.1.2 RColorBrewer_1.1-2 lifecycle_0.1.0 plyr_1.8.5
[17] stringr_1.4.0 munsell_0.5.0 ggsignif_0.6.0 gtable_0.3.0
[21] htmlwidgets_1.5.1 evaluate_0.14 labeling_0.3 fastmap_1.0.1
[25] Cairo_1.5-11 httpuv_1.5.5 crosstalk_1.0.0 Rcpp_1.0.3
[29] xtable_1.8-4 promises_1.1.0 scales_1.1.0 formatR_1.7
[33] jsonlite_1.6.1 mime_0.9 farver_2.0.3 digest_0.6.23
[37] stringi_1.4.5 bookdown_0.17 dplyr_0.8.4 shiny_1.4.0
[41] grid_3.6.2 tools_3.6.2 lazyeval_0.2.2 tibble_2.1.3
[45] crayon_1.3.4 tidyr_1.0.2 pkgconfig_2.0.3 ellipsis_0.3.0
[49] data.table_1.12.8 assertthat_0.2.1 rmarkdown_2.1 httr_1.4.1
[53] R6_2.4.1 compiler_3.6.2